library(tidyverse)
library(ggpubr)
devtools::load_all()
set.seed(44)
library(RColorBrewer)

Simulate the data

ncells <- 20000 # Total number of cells
nmarkers <- 20 # Number of markers
beta_size <- 8 # Sample differnce
sigma_alpha <- 3 # Size of batch effects
sigma_epsilon <- 4 # Size of random noise
sigma_biology <- 8 # Size of inherent biology
nbioclust <- 4

# Sample (treatment/control) design matrix
X <- matrix(c(rep(0,ncells/2), rep(1,ncells/2)), ncells, 1)

# Inherent biology
# Note: the construction ensures that Z and X are uncorrelated. 
Z <- matrix(0, ncol = nbioclust, nrow = ncells)
for(i in 1:nbioclust){
  free <- which(rowSums(Z) != 1)
  Z[sample(free, ncells/nbioclust), i] <- 1
}

# Coefficeints of biology
# Select certain coefficeints and make them non-zero
# gamma_size <- 5
# gamma <- matrix(0, 1, nmarkers)
# gamma[c(1,3,5)] <- rep(gamma_size, 3)

# Randomly sample all coeffcients from N(0, sigma_biology) 
gamma <- matrix(rnorm(nmarkers*nbioclust, sd = sigma_biology), nbioclust, nmarkers)

# Global mean (similair overall shift to CyTOF data)
mu <- matrix(10, ncells, nmarkers)

# Coefficients of interest
beta <- matrix(0, 1, nmarkers)
beta[c(1,3,5)] <- rep(beta_size, 3)

# Unwanted variation design matrix
# Note: This is UNCORRELATED with X
W <- matrix(rbinom(ncells, 1, 0.5), ncells, 1)

# Coeffcients of unwanted variation
alpha <- matrix(rnorm(nmarkers, sd = sigma_alpha), 1, nmarkers)

# Random error
epsilon <- matrix(rnorm(ncells*nmarkers, sd = sigma_epsilon), ncells, nmarkers)

# RUVIII model
Y <-  mu + X %*% beta + Z %*% gamma + W %*% alpha + epsilon

Annotate the simulation data

colnames(Y) <- paste0("Marker", 1:nmarkers)

sample_id <- rep(LETTERS[1:2], each = ncells/2)

cluster_id <- rep(NA, ncells) # Add nonsense to make correct format

data <- data.frame(sample = sample_id, cluster = cluster_id, Y)

data$cluster <- cluster_FlowSOM(data, 5)

bio_ind <- numeric(nrow(Z))
for(i in 1:ncol(Z)){
  bio_ind[which(Z[ ,i] == 1)] <- i
}
bio_ind <- as.matrix(bio_ind, nrow = ncells, ncol = 1)

Plot markers

plot_marker_boxplots(data)

plot_marker_boxplots_samp(data)

PCA plots

pca <- prcomp(as.matrix(data[, 3:ncol(data)]))

# Look at to what extent the pca vectors captures batch effect
plots <- list(qplot(1:ncells, pca$x[,1]) + labs(x = "Index", y = "PCA1") + theme_bw(),
              qplot(1:ncells, pca$x[,2]) + labs(x = "Index", y = "PCA2") + theme_bw(),
              qplot(1:ncells, pca$x[,3]) + labs(x = "Index", y = "PCA3") + theme_bw(),
              qplot(1:ncells, pca$x[,4]) + labs(x = "Index", y = "PCA4") + theme_bw())

ggarrange(plotlist = plots)

Coloured by sample

ggarrange(plotlist = plot_scpca_samp(data, 2000), nrow = 1, ncol = 3, common.legend = TRUE)

Coloured by batch

ggarrange(plotlist = plot_scpca_ann(data, 2000, annotation = W, ann_label = "Batch"), 
          nrow = 1, ncol = 3, common.legend = TRUE)

Colour by biology

ggarrange(plotlist = plot_scpca_ann(data, 2000, annotation = bio_ind, ann_label = "Biology"), 
          nrow = 1, ncol = 3, common.legend = TRUE)

Colour by cluster

ggarrange(plotlist = plot_scpca_clus(data, 2000), nrow = 1, ncol = 3, common.legend = TRUE)

tSNE plots

samp <- sample(1:nrow(data), 1000)
tsne_data <- compute_tsne(data, sample = samp)
## Warning: no DISPLAY variable so Tk is not available
##   Running t-SNE...with seed 42  DONE

tSNE coloured by sample

plot_tsne_sample(tsne_data)

tSNE coloured by batch

plot_tsne_ann(tsne_data, W[samp, ], ann_label = "Batch")

tSNE coloured by biology

plot_tsne_ann(tsne_data, bio_ind[samp, ], ann_label = "Biology")

tSNE coloured by cluster

plot_tsne_cluster(tsne_data)

#plot_tsne_marker(data, tsne_data)

Normalise the data

nrep_groups <- 2
rand <- sample(c(0,1), ncells, replace = TRUE)

# Pseudo replciates can either be "rand", "right" or "wrong"
pseudo_rep <- "right"

if(pseudo_rep == "rand"){
  # random pseudo replicates
  M <- cbind(rand, !rand)
} else if (pseudo_rep == "right"){
  # perfect psedo replicates
  M <- cbind(X, !X)
} else if (pseudo_rep == "wrong"){
  # exactly wrong psuedo replicates
  M <- cbind(W, !W)
}

# Create replciates which respect the biology (perfect)
#M <- (cbind(as.numeric(X & Z), (X & !Z), (!X & Z), (!X & !Z)))
M <- Z 

# Create replicates based on clustering
#i.e. define one cluster to be all psedo-replicates
#M <- cbind(as.numeric(data$cluster == 4), !(data$cluster == 4))

# Check ruv documentation condition satisfied
any(rowSums(M) == 0)
## [1] FALSE
# Standardise the cell data
# Could refactor RUVIII wrappers to make this easier
raw_Y <- as.matrix(data[ ,3:ncol(data)])

col_means <- colMeans(raw_Y)
col_sds <- apply(raw_Y, 2, function(x) sd(x))

for(i in 1:ncol(raw_Y)){
  raw_Y[,i] <- (raw_Y[,i] - col_means[i])/col_sds[i]
}

exclude_neg_controls <- c(1,3,5)
controls <- c(1:nmarkers)[-exclude_neg_controls]

RUVIIout <- fastRUVIII(Y = Y, M, ctl = controls, k = 1)
norm_Y <- RUVIIout$newY
alpha <- RUVIIout$fullalpha
  
for(i in 1:ncol(norm_Y)){
  norm_Y[,i] <- norm_Y[,i]*col_sds[i] + col_means[i]
}

norm_data <- data
norm_data[3:ncol(norm_data)] <- norm_Y
#pheatmap::pheatmap(alpha, cluster_cols = FALSE, cluster_rows = FALSE)
#RMY <- Y - M %*% solve(t(M) %*% M) %*% t(M) %*% Y
#pca <- prcomp(RMY)$x
#qplot(pca[,1], pca[,2]) + labs(x = "PCA1", y = "PCA2") + theme_bw()

Plot markers

plot_marker_boxplots(norm_data)

plot_marker_boxplots_samp(norm_data)

PCA plots

ggarrange(plotlist = plot_scpca_samp(norm_data, 2000), nrow = 1, ncol = 3, common.legend = TRUE)

Coloured by batch

ggarrange(plotlist = plot_scpca_ann(norm_data, 2000, annotation = W, ann_label = "Batch"), 
          nrow = 1, ncol = 3, common.legend = TRUE)

Colour by biology

ggarrange(plotlist = plot_scpca_ann(norm_data, 2000, annotation = bio_ind , ann_label = "Biology"), nrow = 1, ncol = 3, common.legend = TRUE)

Colour by cluster

ggarrange(plotlist = plot_scpca_clus(norm_data, 2000), nrow = 1, ncol = 3, common.legend = TRUE)

tSNE plots

samp <- sample(1:nrow(data), 1000)
tsne_data <- compute_tsne(norm_data, sample = samp)
##   Running t-SNE...with seed 42  DONE

tSNE coloured by sample

plot_tsne_sample(tsne_data)

tSNE coloured by batch

plot_tsne_ann(tsne_data, W[samp, ], ann_label = "Batch")

tSNE coloured by biology

plot_tsne_ann(tsne_data, bio_ind[samp, ], ann_label = "Biology")

tSNE coloured by cluster

plot_tsne_cluster(tsne_data)